{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Programming Exercise 6: Support Vector Machines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.io #Used to load the OCTAVE *.mat files\n",
    "from sklearn import svm #SVM software\n",
    "import re #regular expression for e-mail processing\n",
    "\n",
    "# This is one possible porter stemmer \n",
    "# (note: I had to do a pip install stemming)\n",
    "# https://pypi.python.org/pypi/stemming/1.0\n",
    "from stemming.porter2 import stem\n",
    "\n",
    "# This porter stemmer seems to more accurately duplicate the\n",
    "# porter stemmer used in the OCTAVE assignment code\n",
    "# (note: I had to do a pip install nltk)\n",
    "# I'll note that both stemmers have very similar results\n",
    "import nltk, nltk.stem.porter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2 Spam Classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2.1 Preprocessing Emails"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "emailSample1.txt:\n",
      "> Anyone knows how much it costs to host a web portal ?\r\n",
      ">\r\n",
      "Well, it depends on how many visitors you're expecting.\r\n",
      "This can be anywhere from less than 10 bucks a month to a couple of $100. \r\n",
      "You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 \r\n",
      "if youre running something big..\r\n",
      "\r\n",
      "To unsubscribe yourself from this mailing list, send an email to:\r\n",
      "groupname-unsubscribe@egroups.com\r\n",
      "\r\n"
     ]
    }
   ],
   "source": [
    "print \"emailSample1.txt:\"\n",
    "!cat data/emailSample1.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def preProcess( email ):\n",
    "    \"\"\"\n",
    "    Function to do some pre processing (simplification of e-mails).\n",
    "    Comments throughout implementation describe what it does.\n",
    "    Input = raw e-mail\n",
    "    Output = processed (simplified) email\n",
    "    \"\"\"\n",
    "    # Make the entire e-mail lower case\n",
    "    email = email.lower()\n",
    "    \n",
    "    # Strip html tags (strings that look like <blah> where 'blah' does not\n",
    "    # contain '<' or '>')... replace with a space\n",
    "    email = re.sub('<[^<>]+>', ' ', email);\n",
    "    \n",
    "    #Any numbers get replaced with the string 'number'\n",
    "    email = re.sub('[0-9]+', 'number', email)\n",
    "    \n",
    "    #Anything starting with http or https:// replaced with 'httpaddr'\n",
    "    email = re.sub('(http|https)://[^\\s]*', 'httpaddr', email)\n",
    "    \n",
    "    #Strings with \"@\" in the middle are considered emails --> 'emailaddr'\n",
    "    email = re.sub('[^\\s]+@[^\\s]+', 'emailaddr', email);\n",
    "    \n",
    "    #The '$' sign gets replaced with 'dollar'\n",
    "    email = re.sub('[$]+', 'dollar', email);\n",
    "    \n",
    "    return email"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def email2TokenList( raw_email ):\n",
    "    \"\"\"\n",
    "    Function that takes in preprocessed (simplified) email, tokenizes it,\n",
    "    stems each word, and returns an (ordered) list of tokens in the e-mail\n",
    "    \"\"\"\n",
    "    \n",
    "    # I'll use the NLTK stemmer because it more accurately duplicates the\n",
    "    # performance of the OCTAVE implementation in the assignment\n",
    "    stemmer = nltk.stem.porter.PorterStemmer()\n",
    "    \n",
    "    email = preProcess( raw_email )\n",
    "\n",
    "    #Split the e-mail into individual words (tokens) (split by the delimiter ' ')\n",
    "    #but also split by delimiters '@', '$', '/', etc etc\n",
    "    #Splitting by many delimiters is easiest with re.split()\n",
    "    tokens = re.split('[ \\@\\$\\/\\#\\.\\-\\:\\&\\*\\+\\=\\[\\]\\?\\!\\(\\)\\{\\}\\,\\'\\\"\\>\\_\\<\\;\\%]', email)\n",
    "    \n",
    "    #Loop over each word (token) and use a stemmer to shorten it,\n",
    "    #then check if the word is in the vocab_list... if it is,\n",
    "    #store what index in the vocab_list the word is\n",
    "    tokenlist = []\n",
    "    for token in tokens:\n",
    "        \n",
    "        #Remove any non alphanumeric characters\n",
    "        token = re.sub('[^a-zA-Z0-9]', '', token);\n",
    "\n",
    "        #Use the Porter stemmer to stem the word\n",
    "        stemmed = stemmer.stem( token )\n",
    "        \n",
    "        #Throw out empty tokens\n",
    "        if not len(token): continue\n",
    "            \n",
    "        #Store a list of all unique stemmed words\n",
    "        tokenlist.append(stemmed)\n",
    "            \n",
    "    return tokenlist"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "##### 2.1.1 Vocabulary List"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def getVocabDict(reverse=False):\n",
    "    \"\"\"\n",
    "    Function to read in the supplied vocab list text file into a dictionary.\n",
    "    I'll use this for now, but since I'm using a slightly different stemmer,\n",
    "    I'd like to generate this list myself from some sort of data set...\n",
    "    Dictionary key is the stemmed word, value is the index in the text file\n",
    "    If \"reverse\", the keys and values are switched.\n",
    "    \"\"\"\n",
    "    vocab_dict = {}\n",
    "    with open(\"data/vocab.txt\") as f:\n",
    "        for line in f:\n",
    "            (val, key) = line.split()\n",
    "            if not reverse:\n",
    "                vocab_dict[key] = int(val)\n",
    "            else:\n",
    "                vocab_dict[int(val)] = key\n",
    "                \n",
    "    return vocab_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def email2VocabIndices( raw_email, vocab_dict ):\n",
    "    \"\"\"\n",
    "    Function that takes in a raw email and returns a list of indices corresponding\n",
    "    to the location in vocab_dict for each stemmed word in the email.\n",
    "    \"\"\"\n",
    "    tokenlist = email2TokenList( raw_email )\n",
    "    index_list = [ vocab_dict[token] for token in tokenlist if token in vocab_dict ]\n",
    "    return index_list"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2.2 Extracting Features from Emails"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def email2FeatureVector( raw_email, vocab_dict ):\n",
    "    \"\"\"\n",
    "    Function that takes as input a raw email, and returns a vector of shape\n",
    "    (n,1) where n is the size of the vocab_dict.\n",
    "    The first element in this vector is 1 if the vocab word with index == 1\n",
    "    is in the raw_email, 0 otherwise.\n",
    "    \"\"\"\n",
    "    n = len(vocab_dict)\n",
    "    result = np.zeros((n,1))\n",
    "    vocab_indices = email2VocabIndices( email_contents, vocab_dict )\n",
    "    for idx in vocab_indices:\n",
    "        result[idx] = 1\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of feature vector is 1899\n",
      "Number of non-zero entries is: 45\n"
     ]
    }
   ],
   "source": [
    "# \" ... run your code on the email sample. You should see that the feature vector \n",
    "# has length 1899 and 45 non-zero entries.\"\n",
    "\n",
    "vocab_dict = getVocabDict()\n",
    "email_contents = open( 'data/emailSample1.txt', 'r' ).read()\n",
    "test_fv = email2FeatureVector( email_contents, vocab_dict )\n",
    "\n",
    "print \"Length of feature vector is %d\" % len(test_fv)\n",
    "print \"Number of non-zero entries is: %d\" % sum(test_fv==1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2.3 Training SVM for Spam Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read in the training set and test set provided\n",
    "# Note the feature vectors correspond to the stemming implementation\n",
    "# done in the OCTAVE code... which may be different than mine.\n",
    "\n",
    "# Training set\n",
    "datafile = 'data/spamTrain.mat'\n",
    "mat = scipy.io.loadmat( datafile )\n",
    "X, y = mat['X'], mat['y']\n",
    "#NOT inserting a column of 1's in case SVM software does it for me automatically...\n",
    "#X =     np.insert(X    ,0,1,axis=1)\n",
    "\n",
    "# Test set\n",
    "datafile = 'data/spamTest.mat'\n",
    "mat = scipy.io.loadmat( datafile )\n",
    "Xtest, ytest = mat['Xtest'], mat['ytest']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of training emails =  4000\n",
      "Number of training spam emails =  1277\n",
      "Number of training nonspam emails =  2723\n"
     ]
    }
   ],
   "source": [
    "pos = np.array([X[i] for i in xrange(X.shape[0]) if y[i] == 1])\n",
    "neg = np.array([X[i] for i in xrange(X.shape[0]) if y[i] == 0])\n",
    "print 'Total number of training emails = ',X.shape[0]\n",
    "print 'Number of training spam emails = ',pos.shape[0]\n",
    "print 'Number of training nonspam emails = ',neg.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n",
       "  kernel='linear', max_iter=-1, probability=False, random_state=None,\n",
       "  shrinking=True, tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the SVM training (with C = 0.1) using SVM software. \n",
    "\n",
    "# First we make an instance of an SVM with C=0.1 and 'linear' kernel\n",
    "linear_svm = svm.SVC(C=0.1, kernel='linear')\n",
    "\n",
    "# Now we fit the SVM to our X matrix, given the labels y\n",
    "linear_svm.fit( X, y.flatten() )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training accuracy = 99.83%\n",
      "Test set accuracy = 98.90%\n"
     ]
    }
   ],
   "source": [
    "# \"Once the training completes, you should see that the classifier gets a \n",
    "#  training accuracy of about 99.8% and a test accuracy of about 98.5%\"\n",
    "\n",
    "train_predictions = linear_svm.predict(X).reshape((y.shape[0],1))\n",
    "train_acc = 100. * float(sum(train_predictions == y))/y.shape[0]\n",
    "print 'Training accuracy = %0.2f%%' % train_acc\n",
    "\n",
    "test_predictions = linear_svm.predict(Xtest).reshape((ytest.shape[0],1))\n",
    "test_acc = 100. * float(sum(test_predictions == ytest))/ytest.shape[0]\n",
    "print 'Test set accuracy = %0.2f%%' % test_acc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2.4 Top Predictors for Spam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The 15 most important words to classify a spam e-mail are:\n",
      "['otherwis', 'clearli', 'remot', 'gt', 'visa', 'base', 'doesn', 'wife', 'previous', 'player', 'mortgag', 'natur', 'll', 'futur', 'hot']\n",
      "\n",
      "The 15 least important words to classify a spam e-mail are:\n",
      "['http', 'toll', 'xp', 'ratio', 'august', 'unsubscrib', 'useless', 'numberth', 'round', 'linux', 'datapow', 'wrong', 'urgent', 'that', 'spam']\n",
      "\n",
      "# of spam containing \"otherwis\" = 804/1277 = 62.96%\n",
      "# of NON spam containing \"otherwis\" = 301/2723 = 11.05%\n"
     ]
    }
   ],
   "source": [
    "# Determine the words most likely to indicate an e-mail is a spam\n",
    "# From the trained SVM we can get a list of the weight coefficients for each\n",
    "# word (technically, each word index)\n",
    "\n",
    "vocab_dict_flipped = getVocabDict(reverse=True)\n",
    "\n",
    "#Sort indicies from most important to least-important (high to low weight)\n",
    "sorted_indices = np.argsort( linear_svm.coef_, axis=None )[::-1]\n",
    "print \"The 15 most important words to classify a spam e-mail are:\"\n",
    "print [ vocab_dict_flipped[x] for x in sorted_indices[:15] ]\n",
    "print\n",
    "print \"The 15 least important words to classify a spam e-mail are:\"\n",
    "print [ vocab_dict_flipped[x] for x in sorted_indices[-15:] ]\n",
    "print\n",
    "\n",
    "# Most common word (mostly to debug):\n",
    "most_common_word = vocab_dict_flipped[sorted_indices[0]]\n",
    "print '# of spam containing \\\"%s\\\" = %d/%d = %0.2f%%'% \\\n",
    "    (most_common_word, sum(pos[:,1190]),pos.shape[0],  \\\n",
    "     100.*float(sum(pos[:,1190]))/pos.shape[0])\n",
    "print '# of NON spam containing \\\"%s\\\" = %d/%d = %0.2f%%'% \\\n",
    "    (most_common_word, sum(neg[:,1190]),neg.shape[0],      \\\n",
    "     100.*float(sum(neg[:,1190]))/neg.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Note my SVM gets some different predictor words for spam than shown in the\n",
    "# assignment PDF... I've done debugging and I'm confident it's due to a different\n",
    "# SVM software package, not because of a bug or something in my code.\n",
    "\n",
    "# Also note the optional exercises \"Try your own emails\" and \"Build your own\n",
    "# dataset\" I will be doing seperately in a blog post... Check out\n",
    "# blog.davidkaleko.com/svm-email-filter-implementation.html to have a look!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
